pacman::p_load(ggplot2, dplyr, RMySQL, lubridate, psych, tidyr, plotly, padr, imputeTS, fpp2)
## also installing the dependency 'fma'
##
## The downloaded binary packages are in
## /var/folders/cj/pkfd827d2hb4v282d82v3cch0000gn/T//RtmpnSqV0G/downloaded_packages
##
## fpp2 installed
#remove scientific notation in r
options(scipen=999)
## Create a database connection
con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!', dbname='dataanalytics2018', host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')
#Query
yr_2006 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2006")
yr_2007 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2007")
yr_2008 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2008")
yr_2009 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2009")
yr_2010 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1, Sub_metering_2, Sub_metering_3 FROM yr_2010")
#Combine tables into one dataframe
All_Years <- bind_rows(yr_2007, yr_2008, yr_2009, yr_2010)
#PREPROCESSING
## Combine Date and Time attribute values in a new attribute column with Paste
All_Years <-cbind(All_Years,paste(All_Years$Date,All_Years$Time), stringsAsFactors=FALSE)
## Give the new attribute in the 6th column a header name change the name
colnames(All_Years)[6] <-"DateTime"
## And move the DateTime attribute within the dataset
All_Years <- All_Years[,c(ncol(All_Years), 1:(ncol(All_Years)-1))]
## Convert DateTime from POSIXlt to POSIXct
All_Years$DateTime <- as.POSIXct(All_Years$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown
## timezone '%Y/%m/%d %H:%M:%S'
## Add the time zone
attr(All_Years$DateTime, "tzone") <- "GMT+0"
#Separate daytime in different attributes
All_Years$year <- year(All_Years$DateTime)
All_Years$month <- month(All_Years$DateTime)
All_Years$weekday <- weekdays(All_Years$DateTime)
All_Years$day <- day(All_Years$DateTime)
All_Years$hour <- hour(All_Years$DateTime)
All_Years$minute <- minute(All_Years$DateTime)
#Rename Sub_meterings columns
names(All_Years)[names(All_Years) == "Sub_metering_1"] <- "Kitchen"
names(All_Years)[names(All_Years) == "Sub_metering_2"] <- "Laundry"
names(All_Years)[names(All_Years) == "Sub_metering_3"] <- "AC_Heater"
#Gather all sub_meterings in two attributes (Submetering - Value)
All_Years_Comb_Submeterings <- All_Years %>% gather(Sub_metering, Value, Kitchen:AC_Heater)
#EXPLORATION OF THE DATA
#How many different values we have for the total sub_meterings?
Values_n <- All_Years_Comb_Submeterings %>% group_by(Value) %>% summarise (n = n()) %>% mutate(freq = n / sum(n))
#There are 88 different values.
#Can we convert those values as Factors?
All_Years_Comb_Submeterings$Value = as.factor(All_Years_Comb_Submeterings$Value)
#Visualize the distribution of the Values
Values_n_plot <- ggplot(data=Values_n) + geom_col(mapping = aes(x=Value, y=n))
Values_n_plot %>% ggplotly()
#If we group the values we will find a normal distribution for each group:
#[0, 1, 2], [17, 18, 19], [36, 37,38, 39]
#[11,12,13], [28,29,30]
#That means that we have settled values and not much oscillation.
#Now let's try to find out from which sub_metering those peaks belong.
#Histogram of Values for each sub_metering
Plot_All_Years_Comb_Submeterings <- ggplot(All_Years_Comb_Submeterings, aes(Value, fill = Sub_metering)) + geom_bar()
Plot_All_Years_Comb_Submeterings %>% ggplotly()
#Histograms of the three submetertings
hist(All_Years$Kitchen)

hist(All_Years$Laundry)

hist(All_Years$AC_Heater)

#BoxPlots of the three submeterings
boxplot(Kitchen~year,data=All_Years)

boxplot(Laundry~year,data=All_Years)

boxplot(AC_Heater~year,data=All_Years)

ggplot(All_Years_Comb_Submeterings, aes(Sub_metering, Value )) + geom_boxplot() + coord_flip() + facet_grid(.~year)

#Statistics
summary(All_Years)
## DateTime Date Time
## Min. :2007-01-01 00:00:00 Length:2027288 Length:2027288
## 1st Qu.:2007-12-21 16:32:45 Class :character Class :character
## Median :2008-12-07 16:38:30 Mode :character Mode :character
## Mean :2008-12-09 17:30:05
## 3rd Qu.:2009-11-27 16:09:15
## Max. :2010-11-26 21:02:00
## Kitchen Laundry AC_Heater year
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :2007
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:2007
## Median : 0.000 Median : 0.000 Median : 1.000 Median :2008
## Mean : 1.121 Mean : 1.289 Mean : 6.448 Mean :2008
## 3rd Qu.: 0.000 3rd Qu.: 1.000 3rd Qu.:17.000 3rd Qu.:2009
## Max. :88.000 Max. :80.000 Max. :31.000 Max. :2010
## month weekday day hour
## Min. : 1.000 Length:2027288 Min. : 1.00 Min. : 0.0
## 1st Qu.: 3.000 Class :character 1st Qu.: 8.00 1st Qu.: 5.0
## Median : 6.000 Mode :character Median :16.00 Median :12.0
## Mean : 6.394 Mean :15.62 Mean :11.5
## 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:18.0
## Max. :12.000 Max. :31.00 Max. :23.0
## minute
## Min. : 0.0
## 1st Qu.:15.0
## Median :30.0
## Mean :29.5
## 3rd Qu.:44.0
## Max. :59.0
describe(All_Years)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd min max range se
## DateTime 1 2027288 NaN NA Inf -Inf -Inf NA
## Date 2 2027288 NaN NA Inf -Inf -Inf NA
## Time 3 2027288 NaN NA Inf -Inf -Inf NA
## Kitchen 4 2027288 1.12 6.15 0 88 88 0.00
## Laundry 5 2027288 1.29 5.79 0 80 80 0.00
## AC_Heater 6 2027288 6.45 8.43 0 31 31 0.01
## year 7 2027288 2008.45 1.10 2007 2010 3 0.00
## month 8 2027288 6.39 3.39 1 12 11 0.00
## weekday 9 2027288 NaN NA Inf -Inf -Inf NA
## day 10 2027288 15.62 8.80 1 31 30 0.01
## hour 11 2027288 11.50 6.92 0 23 23 0.00
## minute 12 2027288 29.50 17.32 0 59 59 0.01
#We see that we have a lot of 0's. Which is the influence of the low values in the total? In terms of mean?
#REEEMOVEE
#What if i remove those low values?
All_Years_Greater_2 <- All_Years %>% filter(Kitchen>2, Laundry>2, AC_Heater>2)
summary(All_Years_Greater_2)
## DateTime Date Time
## Min. :2007-01-07 13:15:00 Length:4517 Length:4517
## 1st Qu.:2007-08-08 11:01:00 Class :character Class :character
## Median :2008-05-06 15:46:00 Mode :character Mode :character
## Mean :2008-07-30 20:36:25
## 3rd Qu.:2009-05-16 14:37:00
## Max. :2010-11-22 10:19:00
## Kitchen Laundry AC_Heater year
## Min. : 3.00 Min. : 3.00 Min. : 3.0 Min. :2007
## 1st Qu.:32.00 1st Qu.:15.00 1st Qu.:17.0 1st Qu.:2007
## Median :37.00 Median :31.00 Median :17.0 Median :2008
## Mean :32.73 Mean :27.69 Mean :17.3 Mean :2008
## 3rd Qu.:38.00 3rd Qu.:36.00 3rd Qu.:18.0 3rd Qu.:2009
## Max. :81.00 Max. :76.00 Max. :30.0 Max. :2010
## month weekday day hour
## Min. : 1.000 Length:4517 Min. : 1.00 Min. : 0.00
## 1st Qu.: 2.000 Class :character 1st Qu.: 8.00 1st Qu.:12.00
## Median : 5.000 Mode :character Median :16.00 Median :14.00
## Mean : 5.648 Mean :15.67 Mean :15.05
## 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:19.00
## Max. :12.000 Max. :31.00 Max. :23.00
## minute
## Min. : 0.00
## 1st Qu.:15.00
## Median :30.00
## Mean :29.71
## 3rd Qu.:44.00
## Max. :59.00
describe(All_Years_Greater_2)
## Warning in describe(All_Years_Greater_2): NAs introduced by coercion
## Warning in describe(All_Years_Greater_2): NAs introduced by coercion
## Warning in describe(All_Years_Greater_2): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
## vars n mean sd median trimmed mad min max range
## DateTime 1 4517 NaN NA NA NaN NA Inf -Inf -Inf
## Date* 2 4517 NaN NA NA NaN NA Inf -Inf -Inf
## Time* 3 4517 NaN NA NA NaN NA Inf -Inf -Inf
## Kitchen 4 4517 32.73 10.73 37 34.27 1.48 3 81 78
## Laundry 5 4517 27.69 17.24 31 26.53 10.38 3 76 73
## AC_Heater 6 4517 17.30 1.97 17 17.26 1.48 3 30 27
## year 7 4517 2008.15 1.07 2008 2008.06 1.48 2007 2010 3
## month 8 4517 5.65 3.82 5 5.45 4.45 1 12 11
## weekday* 9 4517 NaN NA NA NaN NA Inf -Inf -Inf
## day 10 4517 15.67 8.46 16 15.62 10.38 1 31 30
## hour 11 4517 15.05 4.47 14 15.19 4.45 0 23 23
## minute 12 4517 29.71 16.84 30 29.77 20.76 0 59 59
## skew kurtosis se
## DateTime NA NA NA
## Date* NA NA NA
## Time* NA NA NA
## Kitchen -0.70 2.72 0.16
## Laundry 0.49 0.40 0.26
## AC_Heater 1.01 21.30 0.03
## year 0.41 -1.14 0.02
## month 0.36 -1.30 0.06
## weekday* NA NA NA
## day 0.00 -1.13 0.13
## hour -0.39 0.14 0.07
## minute -0.04 -1.14 0.25
#Are there any NAs?
summary(is.na(All_Years))
## DateTime Date Time Kitchen
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
## Laundry AC_Heater year month
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
## weekday day hour minute
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
#EXPLORATION OF THE DATA. PLOTS
# Sum
All_Years %>% group_by(year, hour) %>% summarise(Total_Kitchen=sum(Kitchen), Total_Laundry=sum(Laundry), Total_AC_Heater=sum(AC_Heater)) %>% ggplot(aes(x = hour)) +
geom_line(aes(y = Total_Kitchen, col = "Total_Kitchen")) + geom_point((aes(y= Total_Kitchen, col = "Total_Kitchen"))) +
geom_line(aes(y = Total_Laundry, col = "Total_Laundry")) + geom_point((aes(y= Total_Laundry, col = "Total_Laundry"))) +
geom_line(aes(y = Total_AC_Heater, col = "Total_AC_Heater")) +
geom_point((aes(y= Total_AC_Heater, col = "Total_AC_Heater"))) +
scale_x_continuous(breaks = c(1:24), labels = c(1:24), limits = c(1,24)) +
ylab("Energy consumtion") + facet_wrap(~year, scales = "free_x")
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).

#Mean
All_years_mean_by_year_hour <- All_Years %>% group_by(year, hour) %>% summarise(Mean_Kitchen=mean(Kitchen), Mean_Laundry=mean(Laundry), Mean_AC_Heater=mean(AC_Heater)) %>% ggplot(aes(x = hour)) +
geom_line(aes(y = Mean_Kitchen, col = "Mean_Kitchen")) + geom_point((aes(y= Mean_Kitchen, col = "Mean_Kitchen"))) +
geom_line(aes(y = Mean_Laundry, col = "Mean_Laundry")) + geom_point((aes(y= Mean_Laundry, col = "Mean_Laundry"))) +
geom_line(aes(y = Mean_AC_Heater, col = "Mean_AC_Heater")) +
geom_point((aes(y= Mean_AC_Heater, col = "Mean_AC_Heater"))) +
scale_x_continuous(breaks = c(1:24), labels = c(1:24), limits = c(1,24)) +
ylab("Energy consumtion") + facet_wrap(~year, scales = "free_x")
#Plot the mean of submeterings by year. Exclude watts below 2
All_Years %>% group_by(year, hour) %>% filter(Kitchen>2, Laundry>2, AC_Heater>2) %>% summarise(Mean_Kitchen=mean(Kitchen), Mean_Laundry=mean(Laundry), Mean_AC_Heater=mean(AC_Heater)) %>% ggplot( aes(x = hour)) +
geom_line(aes(y = Mean_Kitchen, col = "Mean_Kitchen")) + geom_point((aes(y= Mean_Kitchen, col = "Mean_Kitchen"))) +
geom_line(aes(y = Mean_Laundry, col = "Mean_Laundry")) + geom_point((aes(y= Mean_Laundry, col = "Mean_Laundry"))) +
geom_line(aes(y = Mean_AC_Heater, col = "Mean_AC_Heater")) +
geom_point((aes(y= Mean_AC_Heater, col = "Mean_AC_Heater"))) +
scale_x_continuous(breaks = c(1:24), labels = c(1:24), limits = c(1,24)) +
ylab("Energy consumtion") + facet_wrap(~year, scales = "free_x")
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).

#?
All_Years_Comb_Submeterings %>% group_by(day) %>%ggplot(aes(x=day,y=Value)) + geom_col()

#MISSING ROWS
All_Years_PAD <- All_Years
All_Years_PAD$DateTime %>% get_interval()
## [1] "min"
#missing rows by week = 0 and by month = 0
All_Years_PAD_month <- All_Years_PAD %>% select(DateTime, Kitchen) %>% thicken('month') %>% group_by(DateTime_month) %>% summarise(Kitchen_No = sum(Kitchen)) %>% pad()
## pad applied on the interval: month
summary(is.na(All_Years_PAD_month)) #but we know that there is a missing month, which is december 2010
## DateTime_month Kitchen_No
## Mode :logical Mode :logical
## FALSE:47 FALSE:47
#missing rows by day = 9
All_Years_PAD_day <- All_Years_PAD %>% select(DateTime, Kitchen) %>% thicken('day') %>% group_by(DateTime_day) %>% summarise(Kitchen_No = sum(Kitchen)) %>% pad()
## pad applied on the interval: day
summary(is.na(All_Years_PAD_day))
## DateTime_day Kitchen_No
## Mode :logical Mode :logical
## FALSE:1426 FALSE:1417
## TRUE :9
#missing rows by hour = 421
All_Years_PAD_hour <- All_Years_PAD %>% select(DateTime, Kitchen) %>% thicken('hour') %>% group_by(DateTime_hour) %>% summarise(Kitchen_No = sum(Kitchen)) %>% pad()
## pad applied on the interval: hour
summary(is.na(All_Years_PAD_hour))
## DateTime_hour Kitchen_No
## Mode :logical Mode :logical
## FALSE:34222 FALSE:33801
## TRUE :421
#missing rows by minute = 25975
All_Years_PAD_min <- All_Years_PAD %>% pad(by="DateTime", break_above = 3)
## pad applied on the interval: min
summary(is.na(All_Years_PAD_min))
## DateTime Date Time Kitchen
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2053263 FALSE:2027288 FALSE:2027288 FALSE:2027288
## TRUE :25975 TRUE :25975 TRUE :25975
## Laundry AC_Heater year month
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
## TRUE :25975 TRUE :25975 TRUE :25975 TRUE :25975
## weekday day hour minute
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:2027288 FALSE:2027288 FALSE:2027288 FALSE:2027288
## TRUE :25975 TRUE :25975 TRUE :25975 TRUE :25975
#Plot missing values with imputeTS package
#min
as.ts(All_Years_PAD_min$Kitchen) %>% plotNA.distribution()

#hours
as.ts(All_Years_PAD_hour$Kitchen_No) %>% plotNA.distribution()

#days
as.ts(All_Years_PAD_day$Kitchen_No) %>% plotNA.distribution()

#All_Years_PAD %>% select(DateTime, Kitchen) %>% thicken('hour') %>% group_by(DateTime_hour) %>% pad(by="DateTime")
plotNA.distribution(as.ts(All_Years_PAD_hour$Kitchen_No))

#All_Years_PAD_hour_2 <- All_Years_PAD %>% thicken('hour') %>% pad(by="DateTime_hour", break_above = 3)
# summary(is.na(All_Years_PAD_hour_2))
#
# All_Years_PAD_hour_2 %>% group_by(year) %>% plotNA.distribution()
#
# plotNA.distribution(All_Years_PAD_hour_2$Kitchen)
#
as.ts(All_Years_PAD_hour$Kitchen_No) %>% plotNA.distribution( cexPoints = 2)

#Visualizing a single day (09/01/2008)
houseDay <- filter(All_Years, year == 2008 & month == 1 & day == 9)
## Plot the kitchen
plot_ly(houseDay, x = ~houseDay$DateTime, y = ~houseDay$Kitchen, type = 'scatter', mode = 'lines')
## Plot sub-meter 1, 2 and 3 with title, legend and labels - All observations
plot_ly(houseDay, x = ~houseDay$DateTime, y = ~houseDay$Kitchen, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay$Laundry, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay$AC_Heater, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
#Reducing granularity
## Subset the 9th day of January 2008 - 10 Minute frequency
houseDay10min <- filter(All_Years, year == 2008 & month == 1 & day == 9 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10min, x = ~houseDay10min$DateTime, y = ~houseDay10min$Kitchen, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10min$Laundry, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10min$AC_Heater, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))